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ABSTRACT 



A method of forming a cohort for use in identification of an 
individual by comparing a model of characteristics of the 
individual, such as a model of utterances, with models of the 
cohort including a model for the client in respect of whom 
it is desired to test whether the individual is identifiable. 
Models related to the population excluding the client are 
tested to determine whether they meet an acceptance thresh- 
old test as to identity with a model for the client. Then, from 
each meeting the threshold test, it is determined whether 
those models are distributed so as to present at least a 
substantial probability that models for non-members of the 
population spaced from the client model in all directions will 
each be closer to a member of the cohort, excluding the 
client, than to the client. If that probability is less than a 
predetermined value, a selection is made from the popula- 
tion of another cohort member which will reduce that 
probability. Alternatively, if the mentioned probability is less 
than the predetermined value, a "phantom" model is gener- 
ated for inclusion in the population and which will reduce 
that probability. The method may employ both the described 
selection and "phantom" generation techniques. 

26 Claims, 8 Drawing Sheets 
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/* - - */ 

/* Codeword Mixer */ 

/* - */ 

/* */ 
_ * 

/* */ 
/* */ 
/* */ 

/* */ 



#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 
#include <math.h> 
#include "proto.h" 



#define K 15 /* Dimension of vector */ 

#define Nsize 128 /* Codebook size */ 

#define Mnf 12 /* Minimum number of frames */ 

/*.--- */ 

int main(int argc.char *argv[3) 
{ 

/* V 

/* Variables */ 
/* - */ 



FILE nestfile. *wcodefile. *weifile. *stream: 

int Nf .id.i .j.n.il.m.num-O.correct-O. Icode[Nsize]. Ccode[200]: 

float MFCC[Nsize][K].codebookl[Nsize][K]: 

float codebook[Nsize][K], AA[15]: 

f 1 oat di s . f mi n , vq_er ror . durnny . we 1 [K] . percent : 

char filename[30].cbC20].nn[200]. ch[30]: 

/* */ 

/* Read inputs */ 

/* 1 */ 

AA[0>1. 500260: 
AA[1]-L5468I1; 
AA[2>-0. 205271: 
AA[3]=0. 466673; 
AA[4]«0. 535951: 
AAt5M. 017841; 
AA[6>-0. 789567: 
AA[7]=0. 596287: 
AA[8]=-0. 492664: 
AA[9]=0. 135963: 
AA[10]=-0. 180459: 
AA[ll]-0. 224845: 
M[12]=0. 252975: 
AA[13]=.0. 634357: 
AA[14]=0. 379529: 



if(argc < 2) 

{ 

printf{ "Usage; vq testf11e\n"): 

exit(l): 

} 



Fig. 4 A 



04/18/2004, EAST Version: 1.4.1 



U.S. Patent 



Jun. 27, 2000 



Sheet 4 of 8 



6,081,660 



1d=ato1(argvCl]): 
printfC*1d:3;i\nMd): 
f or ( m=2 : fn<a rgc : m<H- ) 
( 

if((testfile = fopen(argv[m]/r")) NULL) 
{ 

printfC "Error opening file Is !\n".argv[m]) : 
continue: 

} 

Nf-filelength(argv[m])/(s1zeof(float)*K); 
spri ntf (f 1 1 ename . "%$=, basename( a rgv[m] ) ) : 

1f(Nf <= Mnf) 
{ 

pri ntf (Tile %s is shorter than %6 frames, not be tested. \n*. filename. Mnf ) : 

fclose(testfne) : 

continue: 

} 

nuiTH-+: 

/* MFCC = float_matr1x-20(Nf.K): */ 
fread((vo1d *)&MFCC[0][0].slzeof (float). Nf*K.testfne): 

fclosG(tGstflle): 

spri ntf (nn. ''wcodeld.l28*'.id): 

1f((wc'odef1le - fopen(nn."r")) — NULL) 
{ 

printf ("Error opening file %s !\n".cb): 
exlt(l); 

} 

f read ( (void * )&codebook [0 ] [ 0] . s 1 zeof ( f 1 oa t ) . Ns 1 ze*K , wcode f 1 1 e ) : 
fclose(wcodeflle) : 

1f((we1f11e - fopen("mfcc.we1".>")) - NULL) 
{ 

pr1ntf(*Error opening file mfccwei !\n"); 
exit(l); 

} 

for(j=0: j<K;j++) 
f scanf (wel f 11 e . "Xf \&we1 [ j] ) : 
fclose(welflle) ; 

/* */ 

/* Calculate quantization errors */ 
/* */ 



vq_error=0.0: 
for(n«0:n<Nf :n++) 

{ 

fmin=0.0: 
lcode[n]-0: 

Fig. 4B 
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for(j=0:j<K:j++) 

fmirn-(MFCC[n][j]-codebook[0][j])*(MFCCCn][j]-codebook[0][j])*wei[j]: 

fmin=sqrt(ftnin)/K: 
for(1i-l:11<Nsize:l1++) 

{ 

dis=0.0: 

for(j=0:j<K:j++) 

d1s+=(MFCC[n][j]-codeboolc[ii][j])*(MFCC[n][j]-codebook[11][j])*wei[j]: 
dis=sqrt(dis)/K: 
if(dis < fmin) 
{ 

fmi n=dis: 
Icode[n]-ii : 

} 

) 

} 

) 

/* for(1i=0;11<Nsize:11++) 
for(j-0:j<15: 

{ 

codebook 1 [ 1 1 ] [ j ]=MFCC[ 1 1 ] [ j ] : 

) 

*/ 

for(j=0;j<15;j++) 
{ 

for ( n=0 : n<Ns1 ze ; n++) 

codebookl[n][j>codebook[n]U]-0.5*AA[j]: 
sprintf (ch , "WwcodeJfd" ,1d) ; 
streani=fopen(ch . "w" ) : 

fwrite(codebookl.sizeof(float) .Nsize*15, stream) ; 
fclose( stream) : 

} 

} 

Fig. 4C 
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/* -*/ 

/* Average_Codeword_Dlfference.c */ 
/* - */ 



#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 
#include <math.h> 
#include "proto.h" 



#define K 15 /* Dimension of vector */ 

#define Nsize 128 /* Codebook size */ 

#define Mnf 1 /* Minimum number of frames */ 

/* - - - */ 

int mainCint argc.char *argv[]) 
{ 

/* - */ 

/* Variables */ 
/* ---*/ 



FILE *testfi1e. *wcodefile, ^weifile: 

int Nf .id.i , j.n.ii .AA.m.num-0.correct=0. Icode[Nsize] : 

float dis2[Nsize][K], vq_errorl[Nsi2e].d1s3[K]: 

float **MFCC: 

f 1 oat codebookUNsi ze] [K] . codeboDk[Ns 1 ze] [K] : 

f 1 oat di s . di si . f mi n . vq^error . vq_error2 . dumrny . wei [K] . percent ; 

char filename[30]. cb[20].nn[80]: 



/* - */ 

/* Read inputs */ 

/* -- - */ 

if(argc < 2) 
{ 



printf("U5age; vq testfileVn") ; 
exit(l): 

) 

1d=atoi{argv[l]) ; 
printf("id:X1\nMd): 
f or(m=2 ; m<a rgc : m++ ) 
( 

if((testfile - fopen(argv[m]. V")) — NULL) 
{ 

prlntfCError opening file %s !\n".argv[m]); 
continue; 

} 

Nf=filelength(argv[m])/(si2eof(float)*K): 
sprintf (filename, **3;s\basename(argv[m])); 

if(Nf <= Mnf) 

{ 

printf(T11e %s Is shorter than Id frames, not be tested. Xn'.filename.Mnf): 

fclose(testflle) : 

continue: 

} 

Fig. 5 A 
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num++; 

MFCC = float_matr1x_2D(Nf,)C): 

fread((void *)&MFCC[0][0].si2eof(float).Nf*K,testfile): 
fclose(testfile) ; 

sprintf(nn. Vdisk2/trust2/fangxin/Speech-data/vq&/wcode2fd.l28" . id) : 

if((wcodefile = fopenCnn. "r") ) NULL) 
{ 

printf("Error opening file %$ !\n".nn): 
exit(l): 

) 

fread((void *)&codebookl[0][0].sizeof(float).Nsl2e*K.wcodefile) : 
fclose(wcodeflle) ; 

for(ii=0;i1<Nsize;ii-M-) 
for{j«0:j<K:j++) 

codebook [ 1 i ] [ j ]=codebook 1 [ 1 i ] [ j ] : 

/* */ 

/* Calculate quantization errors */ 
/* */ 

vq_error=0.0: 
for(n«0:n<Nf :n++) 

{ 

fmin-O.O: 

lcode[n]-0; 

for{j-0:j<K;j-H-) 

fniin+=(MFCC[n][j]-codebook[0][j])*(MFCC[n][j]-codebook[0][j]); 

fmin=sqrt(fmin)/K; 

f or(1 i «1 : 1 i <Ns1 ze : 11 ++) 

{ 

d1s=0.0; 

for(j=0:j<K;j++) 

d1s+=(MFCC[n][j]-codebook[1i][j])*(MFCC[n][j]-codebook[1i][j]): 
d1s=sqrt(d1s)/K; 
1f(d1s < fmin) 
{ fm1n=dis: 

Icode[n]=i1 ; /* Identify the corresponding codeword */ 

} 

) 

disl=0.0: 
for(j=0:j<K:j++) 
{ 

d1sl+=(MFCC[n][j]-codebook[Icode[n]][J])*(MFCC[n][j]-codebook[Icode[n]][j 
dis2[n][j>{MFCC[n]U]-codebook[Icode[n]][j]); 

] 

disl=sqrt(disl)/K: 



Fig, 5B 
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/* printfC'Cword: %6 Iword: Xd UD: %f \n". Icodetn]. n. disl): */ 
vq_error+=fniin: 

} 

vq_error/=(float)Nf : 

/* printfCXs: lf\n" .argv[m] . vq_error): */ 
} 

/* for (j=0: j<K: j>+) 
( 

for(n=0; n<Nsize; n++) 
vq_errorl[ j3+=dis2[n][j] : 
vq_errorl[j]=vq_errorl[j]/(float)Nsize: 
prTntf(" :^d th code DD %f\r\\ J. vq_errorl[j]) ; 
vq_error2+=(vq_errorl[j]>*(vq_errorl[j]) : 

} 

*/ 

for (j=0: j<K: j-n-) 
( 

for (n=0; n<Nsize: n++) 
dis3[j]+-dis2[n][j]: 
dis3[j]=dis3[j]/128: 
pr1ntf("X2.6f\n d1s3[j]): 

} 

/* for (n=0; n<Nsi2e; n-n-) 
{ 

for (j-0: j<K; 
printfCX2.6f \ dis2[n][j]): 
printfCVn"): 

} 



vq_error2-sqrt ( vq_error2 ) /K : 
printf(" DD++: ^f\n" . vq_error2): 



Fig, 5C 
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METHOD FOR FORMING A COHORT FOR 
USE IN IDENTIFICATION OF AN 
INDIVIDUAL 

TECHNICAL FIELD 

This invention relates to a method for forming a cohort for 
use in identification of an individual, and to a method of 
identification of an individual on the basis of that cohort. The 
method is concerned primarily, but not exclusively, with 
forming a cohort for use in identification of individuals on 
the basis of the degree of conformity of characteristics of 
voice sounds, but may be applied to identification on the 
basis of other characteristics of individuals. 

BACKGROUND OF THE INVENTION 

In determining whether an individual is or is not a 
particular pre-identified individual, i.e. a "client", comparison 
may be made as between pre-determined parameters relating 
to the pre-determined person and those measured when any 
individual is presented for verification. Particular parameters 
which may be used include parameters relating to speech, 
although parameters relating to other characteristics may be 
used. Among those other characteristics are parameters 
relating to how the presenting individual writes, uses a 
computer mouse, or uses a computer or other keyboard. 

One method of identification, or verification, of whether 
or not an individual presenting for verification is or is not a 
pre-determined individual makes use of client models rep- 
resenting each of a population of individuals. Characteristics 
relating to a person presenting for verification are measured 
and compared with the characteristics for one or more of the 
total population. If the characteristics for the person pre- 
senting for verification match those for a particular one of 
the population, then the verification system makes a deter- 
mination that the presenting person is the particular indi- 
vidual for which the characteristics match. A difficulty with 
systems of this kind is that values for characteristics for any 
person presenting may differ from reference values for that 
person which are used by the system. For example, the 
values for characteristics used by the system would normally 
comprise stored values measured in a previous test on the 
individual, the stored value then being compared with those 
measured when the person presents for verification. 
However, naturally occurring variations may exist as 
between those values stored and those which arise when a 
verification procedure is carried out. In the case of verifi- 
cation on the basis of characteristics relating to utterances of 
a person, those variations may, for example, comprise pho- 
netic variations, variations due to environmental conditions 
and intra-speaker variations. Thus, a person may utter a 
vowel in one fashion when the vowel appears in one word, 
and in a different fashion when it appears in another word. 
Again, the test conditions under which the original charac- 
teristic values were determined may be noise free, but there 
may be noise present in the environment when the individual 
presents for verification. Generally, then, it is not surely 
possible to effect identification simply on the basis of direct 
equalability of measured characteristics with those stored for 
the individual in question. Normally, comparison is effected 
as between characteristic values for more than one of the 
population, the determination of identity being made on the 
basis of the "distance" between the characteristics as stored 
for more than one of the population and those measured at 
verification. The characteristics which are measured in the 
verification process may be multi dimensional. For example, 
it has been found convenient to use cepstral analysis tech- 



niques to analyse the speech of a population and the person 
presenting for verification. Overlapping samples of, say, 30 
millisecond may be taken of the amplitude-time wave form 
recorded during speech. In this case, it is convenient to 
generate 15 cepstral coefficients and to generate a model 
representing each member of the population and of the 
person presenting for verification, the models being 15 
dimensional and with, for example, 128 points. The set of 
such points is commonly referred to as a code book for the 
person in question. 

In the comparison of the code book of the person pre- 
senting for verification and those for the reference popula- 
tion employed by the verification technique, one may choose 
from the code books for the population code books of a 
"cohort", being a limited number of the population, and then 
compare the code book of the presenting person with code 
books for that cohort. The cohort is selected from the total 
population on the basis that there is some similarity between 
the code book for the "client" in the population (i.e. the 
person whom the person presenting for verification purports 
to be) and the relevant cohort members. Selection of the 
cohort members can be made on the basis of the proximity 
of the centroids of the code book distributions to the centroid 
of the client's code book distributions. It is important that 
the multi-dimensional (Euclidean) distance between the 
centroid for the client and the various cohort members be 
significant, but not too great. 

While methods based on the above have been found to be 
workable, hitherto inexplicable errors sometimes arise. For 
example, an error as basic as failure to discriminate between 
a male and a female voice may occur. It has now been 
determined that a likely cause of this difficulty is that the 
cohorts which are selected for the particular client do not 
have code book distributions which "surround" the code 
book distributions for the client in a satisfactory fashion. In 
particular, if the distance from the centroid of the code book 
distributions for the person presenting for verification to the 
client code book distribution centroid is great, then the 
difference between the distance to the centroids of the code 
book distributions for the client and for other cohort mem- 
bers will be relatively small. It may easily arise in this case 
that, because of the distribution of the cohort members with 
respect to the client, the distance between the code book 
distribution centroids of the client and of the person pre- 
senting for verification is less than the distance from the 
code book distribution centroid for the person presenting 
for verification to any of the other cohort members, at 
least as applies to some particular direction as between the 
code book distribution centroids for the person presenting for 
verification and for the client and cohorts. Thus, the verifi- 
cation scheme may incorrectly assume that the person 
presenting for verification is the client in this instance. 
Merely increasing the number of cohorts will not necessarily 
rectify this problem. 

SUMMARY OF THE INVENTION 

In accordance with the present invention, the "coverage" 
is extended by 

a) selecting appropriate new cohort members from the 
population, and/or 

b) generating from data relating to existing cohort 
members, including or excluding a particular client, a 
model for inclusion in the cohort. 

More particularly, embodiments of the invention provide 
methods for synthesising speech models for "phantom" 
speakers with specified speech characteristics, comprising: 
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(i) for determining the desired characteristics for each 
successive cohort member during incremental assem- 
bly of a cohort; and/or 

(ii) constructing synthetic speech models with the desired 
characteristics. 

The synthesised models may be formed from combina- 
tions of real speech models. For example, speech events fall 
into several different classes (vocalic, fricative, nasal, etc.); 
during the synthesis procedure, those parts of the real speech 
models pertaining to different classes of speech events may 
be considered separately. As a result of their method of 
composition, the synthesised speech models may be repre- 
sentative of possible real speakers. 

In one specific aspect, the invention comprises a method 
of assembling a cohort for a client being one of a population, 
comprising testing whether models of at least a substantial 
number (preferably all) of the population excluding the 
client meet an acceptance threshold test as to identity with 
a model for the client, determining, from each model meet- 
ing the threshold test, whether those models are distributed 
so as to present at least a substantial probability that models 
for non-members of the population spaced from the client 
model in all directions will each be closer to a member of the 
cohort, excluding the client, than to the client and, if that 
probability is less than a predetermined value, selecting 
from the population another cohort member which will 
reduce that probability. 

In another aspect, the invention provides a method of 
assembling a cohort for a client, being one of a population, 
comprising testing whether models of at least a substantial 
number (preferably all) of the population excluding the 
client meet an acceptance threshold test as to identity with 
a model for the client, determining, from each model meet- 
ing the threshold test, whether those models are distributed 
so as to present at least a substantial probability that models 
for non-members of the population spaced from the client 
model in all directions will each be closer to a member of the 
cohort, excluding the client, than to the client and, if that 
probability is less than a predetermined value, generating a 
new model for inclusion in the population and which will 
reduce that probability. 

In another aspect the invention provides a method of 
assembling a cohort for a client, being one of a population, 
comprising testing whether models of at least a substantial 
number (preferably all) of the population excluding the 
client meet an acceptance threshold test as to identity with 
a model for the client, determining, from the or each meeting 
the threshold test, whether those models are distributed so as 
to present at least a substantial probability that models for 
non-members of the population spaced from the client 
model in all directions will each be closer to a member of the 
cohort, excluding the client, than to the client and, if that 
probability is less than a predetermined value, either select- 
ing from the population another cohort member which will 
reduce that probability or generating a new model for 
inclusion in the population and which will reduce that 
probability. 

The invention also provides a method of verification using 
a cohort assembled as above described. 

The invention may be practiced with models of different 
types, for example vector quantisation or hidden Markov 
models. 

BRIEF DESCRIPTION OF THE DRAWINGS 

FIG. 1 is a graph plot illustrating analysis of residuals 
with similar and dissimilar codebooks. 

FIG. 2 is a diagram depicting a hyperellipsoid concentric 
with a client containing centroids of codebooks for speakers 
similar to the client. 
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FIG. 3 is a table illustrating Achieved Equal Error Rate 
percentages. 

FIGS. 4A, 4B, and 4C form a listing of a "C" computer 
program suitable for finding the average vector distance 
between sets of paired cohorts. 

FIGS. 5A, 5B, and 5C form a listing of a computer 
program suitable for synthesizing codebook distributions for 
a "phantom" population member. 

DETAILED DESCRIPTION OF THE INVENTION 

The following detailed description describes in more 
detail the context of the invention, and preferred features of 
the invention. 

The "cohort normalised" method of speaker verification 
computes for each input utterance its relative distance from 
models of the client and a cohort of speakers drawn from the 
same population. It is assumed that variations which reduce 
the utterance fit to the client model will tend to have similar 
effects with respect to the cohort speaker models. The use of 
"relative distance" can lead to improved client/impostor 
discrimination. 

The following relates to the design of suitable cohorts. 
Using VQ codebooks in multidimensional cepstral space as 
the basic speaker models, pairs of codebooks can be related 
geometrically in terms of vector differences between their 
centroids in cepstral space. In a well-designed cohort, the 
cohort members give adequate "coverage" of the client's 
codebook in multidimensional space. 

Cohort members are usually chosen on the basis of their 
similarity to the client. Experiments in which cohort mem- 
bers were instead chosen according to their position relative 
to the client led to a slight improvement in verification 
performance, suggesting that joint consideration of similar- 
ity and position would give even better results. However, 
with a limited set of speakers, it will often be difficult to find 
cohort members who meet these simultaneous requirements. 
At least in certain cases it is possible to synthesise suitable 
"phantom" codebooks based on those of real speakers. 

In the classic procedure for speaker verification, an input 
utterance is accepted or rejected according to a threshold on 
its goodness of fit with a model of the client's speech. While 
such a measure truly reflects absolute deviations between the 
client's model and input utterances, it is sensitive to over- 
lapping client and impostor distributions which arise 
because of the effects of intra-speaker variation, recording 
environment change and phonetic variation. This in turn 
leads to a high Equal Error Rate (EER). 

An alternative approach (Rosenberg et al., 1992) uses a 
"cohort" of speakers, with speech models similar to that of 
the client, allowing relative measures of similarity or dif- 
ference to be computed and reducing problems due to the 
above-mentioned variations. Similarity is judged on the 
basis of the mean distortion of a potential cohort speaker's 
utterances with respect to the client speaker's VQ model. 

Tests of the cohort method show that it is subject to 
problems with false acceptance of impostor utterances 
which are quite dissimilar to those of the client (e.g. from a 
speaker of opposite sex to the client) but which still give a 
better fit to the client model than to any of the cohort models. 
A tentative geometrical explanation of this problem has been 
given in Chen, F., Millar, B. and Wagner, M. (1994), 
"Hybrid threshold approach in text-independent speaker 
verification," Proc. Int. Conf. on Spoken Language 
Processing, Yokohama, 1855-1858, suggesting that the 
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problem arises from inadequate "coverage" of the client by 
cohort members. Thus, a significant practical difficulty asso- 
ciated with use of the cohort-normalised method is that of 
assembling a suitable cohort from among the set of indi- 
viduals whose speech has been modelled. In many cases, 
this set will be too small and for certain clients will not 
include a suitable set of speakers with similar speech models 
from which to assemble a cohort. Choice of suitable cohort 
members needs to be based on an understanding of the 
relationship between pairs of codebooks. Unless suitable 
potential cohort members are available and the cohort mem- 
bers are selected carefully, anomalous verification behaviour 
may result (e.g. an impostor of the opposite sex being 
verified as the client). Verification performance tends to 
improve with cohort size, but this increases verification 
time. By appropriate choice of cohort members, one can 
form a cohort of minimum size for a specified level of 
performance. 

The techniques covered of the present invention directly 
address practical difficulties associated with assembling a 
suitable cohort for each client in the absence of a large set 
of speech models from which to select cohort members. 
Speakers may, in the following, be considered to be char- 
acterised by codebooks of 128 codewords (vectors in 15-D 
mel-frequency cepstral space) chosen such as to minimise 
the encoding error (distortion) with the training data sets. 
The number of muscle groups used in articulating speech 
sounds is much less than 15. Most of the relevant informa- 
tion for phonetic discrimination in the speech of two males 
can be represented with about six cepstral coefficients (Davis, 
S. B. and Mermelstein, P. (1980), "Comparison of paramet- 
ric representations for monosyllabic word recognition in 
continuously spoken sentences," IEEE Trans. Acoustics, 
Speech and Signal Processing, Vol. ASSP-28, 357-366). 
High-dimensional cepstral data relating to vocalic speech 
tends to fall on low-dimensional quadratic surfaces 
(predominantly parabolic) which can be characterised in 
terms of only four parameters (Hawkins, S., Macleod, I. and 
Millar, B. (1994), "Modelling individual speaker character- 
istics by describing a speaker's vowel distribution in 
articulatory, cepstral and formant space," Proc. Int. Conf. on 
Speech Science and Technology, Perth). Important compo- 
nents of the codeword distributions will thus have lower 
intrinsic dimensionality than that of their space of represen- 
tation; the overall distributions can thus be expected to show 
significant clustering in cepstral space. 

The similarity of a pair of codebooks may be assessed by 
measuring the distortion when one codebook is used to 
encode the speech data on which it was trained, and then to 
compare this to the distortion obtained with this data using 
the other codebook. Given that the codebooks for all speak- 
ers have been trained on the same set of utterances, the 
regions most densely occupied by codewords should be 
similar for pairs of similar codebooks. 

The similarity of codebooks measured in such a way 
represents the similarity of speakers. The ratio of distortions 
is a scalar magnitude; as a directionless quantity it thus gives 
no indication as to which of two given codebooks would 
yield the smaller distortion when encoding the training data 
for a third, for example. As a similarity measure it provides 
an estimate of how "close" the regions of cepstral space 
occupied by the two codebooks are, but it does not indicate 
their relative positions. Scalar measures are thus of only 
limited use in diagnosing problems with a given cohort or in 
choosing cohort members for a given client. 

A simple vector measure considers the relative differences 
between codebook centroids (formed from the average of all 
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vectors in a codebook). While pairs of speaker models which 
give relatively small errors in encoding each other's training 
data will have similar distributions of codewords in cepstral 
space, with pairs of similar codebooks there may still be 
considerable interspersion of codewords. The question 
arises as to whether any differences (in magnitude and 
direction) between the centroids of such codebooks are 
meaningful in the statistical sense. Given the inhomogeneity 
and complexity of the codeword distributions in cepstral 
space, simple statistical characterisations (based on vari- 
ances of these distributions) are not appropriate for answer- 
ing this question. An alternative method associates the 
codewords in one book with neighbours in the other (eg, on 
the basis of closest Euclidean distance, as used in the 
following) and then analyses the distributional properties of 
the resulting set of 128 difference vectors, to see if they 
cluster in particular directions. A method for analysing these 
properties is next described. 

A method to test the statistical significance of vectorial 
relationships between codebooks is now developed. The 
analysis of the difference vectors comprises the following 
steps: 

(i) determine a mean directional component; 

(ii) test the statistical significance of this component; and 

(iii) subtract the mean from all vectors before analysing 
the residuals with Principal Components Analysis 
(PCA). 

The mean vector between codeword pairs can simply be 
shown to be equal to the vector between the corresponding 
codebook centroids. PCA is used to check to what extent 
directional variability between codeword pairs is concen- 
trated in a few directions. 

Distributions of difference vectors with relatively similar 
and dissimilar pairs of codebooks have been analysed in 
accordance with the principles of this invention (with simi- 
larity being assessed in terms of average distortion), using 
Hotelling's $T^2$ statistic to test the hypothesis that the mean 
vector of the difference vectors was non-zero. For all pairs 
of codebooks examined, the hypothesis was confirmed 
(p<0.0001) showing that the difference vectors tend to point 
in a consistent direction. As a result of the low intrinsic 
dimensionality of the cepstral distributions of vocalic speech 
(Hawkins, Macleod and Millar, 1994), a significant propor- 
tion of the codewords will tend to cluster on hypersurfaces 
of lower dimensionality. If, however, there was substantial 
interspersion of the codeword distributions in the codebooks 
being compared, the difference vectors would have a less 
consistent orientation. The results of the analysis performed 
in this embodiment of the invention show that the degree of 
interspersion is limited, thus indicating that distributions of 
codewords from similar codebooks have similar shapes and 
that the concept of relative displacements between codebook 
pairs has statistical validity. After subtracting the mean 
vector from each difference vector, analysis of the residuals 
with PCA revealed one distinct non-noise directional com- 
ponent with dissimilar pairs of codebooks and two orthogo- 
nal components with similar codebook pairs (one compo- 
nent being somewhat larger than the other). This is shown in 
FIG. 1 which illustrates analysis of residuals with similar 
and dissimilar codebooks. 

The presence of non-noise Principal Components in the 
residuals, after the mean vector is subtracted, means that 
there are further systematic variations in the relationships 
between pairs of codeword distributions in addition to the 
mean displacement. Two codebooks with similar centroids 
may thus give large distortions when encoding each other's 
training data (eg, if one codebook had a greater span in 
certain directions than the other). 
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An estimate of progress towards explaining the total 
relationship between two codebooks is obtainable by com- 
puting the length of the (vector) sum of the difference 
vectors and comparing this to the sum of the scalar lengths 
of the individual vectors. If all difference vectors point in the 
same direction, these two lengths will be the same. If the 
difference vectors are randomly oriented, the summed vector 
length will be only a small fraction of the scalar sum of 
lengths. On examining the codebooks of potential cohort 
members in relation to given client codebooks, it was found 
that this length ratio varied from about 25% to 40%, a much 
larger than expected length for the sum of random vectors. 
In addition to supporting the statistical finding that the 
difference between codebook centroids is real, this result 
means that a large enough component of the total relation- 
ship is captured that clear benefits should follow from taking 
relative codebook positions into account when constructing 
cohorts. 

The above provides statistical justification for using rela- 
tive centroid positions to consider the extent to which the 
members of a cohort "enclose" a client or leave "gaps" in the 
coverage, given possible interspersion of the codeword 
distributions of similar speakers. The minimum distortion 
among the cohort models and the mean distortion across 
cohort models have both been proposed for use in the 
client/cohort comparison. The following tests are based on 
use of the min statistic. 

An optimal cohort is one in which for each potential 
impostor there is a cohort member whose codebook encodes 
impostor utterances with lower distortion than that achieved 
with the client's codebook. Care is needed not to falsely 30 
reject the client's speech, so such cohort codebooks need to 
encode the client's training data with a significantly (but not 
dramatically) larger distortion than that obtained with the 
client's codebook: The cohort members should be similar, 
but not too similar, to the client. A percentage of impostors 35 
with speech very similar to the client will thus be falsely 
accepted, but this is unavoidable. Referring to FIG. 2, 
imagine a hyperellipsoid (concentric with the client), which 
contains the centroids of codebooks for speakers similar to 
the client. The members of one potential cohort could then 40 
be distributed on the surface of a second larger hyperellip- 
soid with roughly twice the diameters of the first, so that (on 
average) utterances made by speakers whose codebook 
centroids lay outside the first hyperellipsoid would be attrib- 
uted to a cohort member, and utterances made by speakers 45 
whose codebook centroids lay inside would be attributed to 
the client. By varying the size of the smaller hyperellipsoid, 
the desired balance between Type I and Type II 
errors can be achieved. (Hyperellipsoids are advanced here 
instead of hyperspheres, because of the fact that other 50 
codebooks are unlikely to be evenly distributed about the 
client's.) 

In the usual case, only a limited set of speakers (and their 
trained codebooks) will be available for cohort construction. 
The most similar speakers in this set to a given client may 55 
well be less (or sometimes more) similar than desired. 
Nevertheless, a functional cohort of size N can be formed by 
choosing the N most similar codebooks. Just as the code- 
word distributions themselves will be of lower intrinsic 
dimensionality than that of the representation space, it might 60 
be expected that the relative positions of codebook centroids 
(and thus of cohort members) will also be unevenly distrib- 
uted. For example, cepstral features will tend to vary in a 
systematic manner with changes in parameters such as vocal 
tract length and shape. 65 

In terms of geometric analogy, anomalous acceptance of 
dissimilar impostors with a cohort chosen from the speakers 



most similar to the client arises because the client is "cov- 
ered" too sparsely or too unevenly. An alternative procedure 
for assembling a cohort is as follows. Choose a speaker who 
is similar (but not too similar) to the client as the first cohort 
member. Test the remaining speaker population to see which 
speaker (of about the desired similarity to the client) gives 
the highest percentage of false acceptances with this cohort 
of size one. This speaker will lie in a direction which is not 
well covered by the first cohort member and is chosen as the 
second cohort member. The procedure is repeated until a 
cohort of the required size has been formed. 

Speech data useful in practicing the invention is described 
in Millar, B., Chen, F., Macleod, I., Ran, S., Tang, H., 
Wagner, M. and Zhu, X. (1994), "Overview of speaker 
verification studies towards technology for robust user- 
conscious secure transactions," Proc. Int. Conf. on Speech 
Science and Technology, Perth. The population of 45 speak- 
ers is divided into two — a cohort formation population of 25 
speakers and a client/test population of 20 speakers (10 male 
and 10 female). Using the method of assessment outlined in 
Millar, B., Chen, F. and Wagner, M. (1994), "The efficacy of 
cohort normalisation in a speaker verification task under 
different types of speech signal variance," Proc. Int. Conf. 
on Speech Science and Technology, Perth, a test was made 
of the verification performance of cohorts assembled (i) 
from the speakers most similar to the client, and (ii) by 
starting with the most similar speaker to the client, adding 
the speaker who gave the greatest number of false accep- 
tances with this cohort of size one, and so on as each new 
member was added. Because of the limited speaker popu- 
lation available, the similarity of cohort members to the 
client was not considered in building up the cohort using the 
"optimum direction" method (which was intended to iden- 
tify and then fill gaps in the cohort coverage of the client). 
The results given in Table 1 show a slight advantage for the 
direction method, even though (apart from the first cohort 
member) similarity to the client was not considered. For 
several clients, the EER with the "optimum direction" 
procedure increased slightly as the cohort size increased 
from three to five; in this case the final one or two cohort 
members chosen must have led to false rejections of the 
client (ie. these members were too similar to the client). 

Analysis of the EERs achieved with Min5 and Sel5 
showed that the observed improvement with Sel5 was not 
statistically significant. Thus these experiments indicate that 
the direction and similarity methods produce cohorts of 
similar quality. Given the different basis of these two meth- 
ods of assembling cohorts, simultaneous consideration of 
both coverage and similarity may improve overall perfor- 
mance. 

Given the difficulties encountered with locating suitable 
cohort members (because of the limited population of 
speakers), the question arises as to whether it is possible to 
form synthetic codebooks with the desired properties. For 
example, it would be possible to modify the client's code- 
book to get a new codebook which is just sufficiently 
dissimilar (ie. gives the desired amount of distortion when 
encoding the client's training data with respect to the 
balance of Type I and Type II errors). For example, it would 
be possible to disturb 1 or more of the 15 coefficients in each 
codeword at a time to yield synthetic cohorts displaced a 
desired distance from the client in the direction of the altered 
coefficients. Experiments showed that codebooks synthe- 
sised in this manner had little practical utility — they usually 
did not encode impostor utterances as efficiently as the 
client's codebook and thus did not lead to improvements in 
speaker verification performance. The source of the problem 
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here is the use of codeword distributions which are most 
likely densely clustered in only a small region of the 15-D 
cepstral space. In synthesising "phantom" codebooks we 
need to ensure that the synthetic codewords are representa- 
tive of those of typical speakers similar to the client. 5 
Working in a space which is known to be inhomogeneously 
occupied, we can minimise errors arising from inhomoge- 
neities by using codeword pairs from similar real speakers 
and interpolating synthesised values, thereby staying "close" 
to known real values. 10 

Experiments in synthesising codebooks by either adding 
or subtracting a fixed vector displacement to or from all 
codewords in a real speaker's codebook, either the client's 
or a (potential) cohort member's, were instructive. The fixed 
displacement was usually 50% of the difference vector 
between the client's and cohort's codebook centroids. In a 
typical example, the client's codebook encoded a set of test 
client utterances with a distortion of 2783, the cohort's 
codebook gave a distortion of 3323, the client's codebook 
displaced by either + or -50% of the difference vector 
between the centroids gave distortions of 2811 and 2799 20 
respectively, and the cohort's codebook displaced by + or 
-50% of this difference vector gave distortions of 3422 and 
3255 respectively. Two points to be noted here are that (i) the 
observed increases and decreases in distortion are consistent 
with our geometric interpretation, and (ii) when the client 25 
codebook is displaced halfway towards the cohort, the 
distortion increases but is still substantially smaller than the 
(reduced) distortion obtained when the cohort codebook is 
displaced halfway towards the client. 

The second point above provides further evidence that the 30 
distributions of codewords vary in ways other than overall 
position — speakers are characterised by the shapes of their 
codeword distributions as well. A second method of inter- 
polation was thus tried, which aimed to indirectly capture 
something of these other dimensions of variation. Instead of 35 
adding a fixed vector displacement to all codewords, inter- 
polation (or extrapolation) was effected on the basis of 
individual difference vectors between codeword pairs. As an 
increasing percentage of these difference vectors are added 
to the codewords in the client codebook, so the synthesised 40 
codebook will gradually change from one that is similar to 
the client codebook into one that is similar to the cohort 
codebook. For the example client and cohort codebooks 
considered above, a synthetic codebook interpolated using 
50% of the individual difference vectors for codeword pairs 45 
gave a distortion of 3078, which was close to halfway (3053) 
between the respective client and cohort distortions of 2784 
and 3323. 

FIG. 3 illustrates Achieved Equal Error Rate percentages 
with an absolute threshold (ABS_VQ) and with selected 50 
cohorts of size n chosen conventionally (Minn) and accord- 
ing to false acceptances (Seln). The final column (Sel5') 
shows the improved results obtained with several clients 
(marked with *) through use of a final synthetic cohort 
member. This demonstrates that with some clients the EER 55 
increased from Sel4 to Sel5. In these cases, the chosen fifth 
cohort member was used to construct an extrapolated syn- 
thetic codebook (moving the chosen cohort codebook fur- 
ther away from the client) and the EER was recalculated (shown 
as Sel5'). In all cases this procedure prevented the EER from 60 
increasing between Sel4 and Sel5'; in two cases (clients 19 
and 20) the synthetic cohort member reduced the EER 
between Sel4 and Sel5'. The reduction in the overall error 
rate to 2.83% was not, however, sufficient to make the 
difference between Min5 and Sel5' statistically significant. 65 

The overall results of the experiments provide evidence 
that the distributions of codewords in 15-D MFCC space are 
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rather complex. Although it can be shown statistically that 
the observed mean displacements between similar code- 
books are real and do not occur just by chance, the distri- 
butions of codewords in given codebooks will vary in shape 
and extent as well as position. The present concept of 
relative codebook positions captures an important part, but 
only a part, of the total relationship between similar code- 
books. 

The listing shown in FIGS. 4A-4C is for a "C" computer 
program suitable for finding the average vector distance 
between sets of paired cohorts. 

The program listing shown in FIGS. 5A-5C is for a "C" 
program suitable for synthesizing codebook distributions for 
a "phantom" population member. 

We claim: 

1. A method of assembling a cohort for a client being one 
of a population, comprising testing whether models related 
to the population excluding the client meet an acceptance 
threshold test as to identity with a model for the client, 
determining, from each model meeting the threshold test, 
whether those models are distributed so as to present at least 
a given probability that models for non-members of the 
population spaced from the client model in all directions will 
each be closer to a member of the cohort, excluding the 
client, than to the client and, if that probability is less than 
a predetermined value, selecting from the population 
another cohort member which will reduce that probability. 

2. The method as claimed in claim 1 wherein said models 
related to the population excluding the client comprises all 
of the population excluding the client. 

3. The method as claimed in claim 1 wherein said models 
are codebooks each of a number of codewords. 

4. The method as claimed in claim 3 wherein said testing 
is effected by assessing a distance between centroids of pairs 
of the codebooks. 

5. The method as claimed in claim 3 wherein said testing 
is effected by assessing a distance between codewords in one 
said codebook and neighbour codewords in another said 
codebook. 

6. The method as claimed in claim 4 wherein the distance 
is a Euclidean distance. 

7. The method of claim 5 wherein the distance is a 
Euclidean distance. 

8. The method of claim 1, further comprising: 

(a) choosing a first model among models of the population 
not including the client model, said first model being 
similar to but still exhibiting significant differences 
with respect to the client model, 

(b) adopting said test model as a first member of the 
cohort, 

(c) testing the remaining models for the population, 
excluding the client and first models, to determine a 
further model, among those of the remaining models 
which have a degree of similarity to the client model 
similar to that which exists between the first and client 
models, which provides the highest degree of false 
acceptances with respect to the client, 

(d) adding said further model to said cohort, and 

(e) repeating steps (c) and (d) using all models previously 
added to the cohort and the client model to generate 
successive other further models which are added to the 
cohort. 

9. The method of claim 1 wherein the models are vector 
quantization or hidden Markov models. 

10. The method of claim 1 wherein said models represent 
speech characteristics. 
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11. The method of claim 1, further comprising comparing 
a model relating to said person with said cohort and deter- 
mining whether the person is the client on the basis of 
similarity of the models relating to the person and to the 
cohort. 5 

12. The method of assembling a cohort for a client being 
one of a population, comprising testing whether models 
related to the population excluding the client meet an 
acceptance threshold test as to identity with a model for the 
client, determining, from each model meeting the threshold 10 
test, whether those models are distributed so as to present at 
least a given probability that models for non-members of the 
population spaced from the client model in all directions will 
each be closer to a member of the cohort, excluding the 
client, than to the client and, if that probability is less than 15 
a predetermined value, generating a new model for inclusion 

in the population and which will reduce that probability. 

13. The method as claimed in claim 12 wherein said 
models related to the population excluding the client com- 
prises all of the population excluding the client. 20 

14. The method as claimed in claim 12 wherein said 
models are codebooks each of a number of codewords. 

15. The method as claimed in claim 14 wherein said 
testing is effected by assessing a distance between centroids 
of pairs of the codebooks. 25 

16. A method as claimed in claim 14 wherein said testing 
is effected by assessing a distance between codewords in one 
said codebook and neighbour codewords in another said 
codebook. 

17. The method as claimed in claim 15 wherein the 30 
distance is a Euclidean distance. 

18. The method of claim 16 wherein the distance is a 
Euclidean distance. 

19. The method of claim 14 wherein the new model is 
generated by adding or subtracting a fixed vector displace- 
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ment to the codewords of models in the population exclud- 
ing any generated models. 

20. The method of claim 12 wherein the models are vector 
quantization or hidden Markov models. 

21. The method of claim 12 wherein said models repre- 
sent speech characteristics. 

22. The method of claim 12, further comprising compar- 
ing a model relating to said person with said cohort and 
determining whether the person is the client on the basis of 
similarity of the models relating to the person and to the 
cohort. 

23. A method of assembling a cohort for a client being one 
of a population, comprising testing whether models related 
to the population excluding the client meet an acceptance 
threshold test as to identity with a model for the client, 
determining, from each meeting the threshold test, whether 
those models are distributed so as to present at least a given 
probability that models for non-members of the population 
spaced from the client model in all directions will each be 
closer to a member of the cohort, excluding the client, than 
to the client and, if that probability is less than a predeter- 
mined value, either selecting from the population another 
cohort member which will reduce that probability or gen- 
erating a model for inclusion in the population and which 
will reduce that probability. 

24. The method as claimed in claim 23 wherein the 
models are vector quantisation or hidden Markov models. 

25. The method as claimed in claim 23 wherein said 
models represent speech characteristics. 

26. The method of claim 23, further comprising compar- 
ing a model relating to said person with said cohort and 
determining whether the person is the client on the basis of 
similarity of the models relating to the person and to the 
cohort. 



04/18/2004, EAST Version: 1.4.1 



